This workflow fits a model across all of CONUS that predicts whether a location does or does not have trees.

The data consists of vegetation % cover by functional group from across CONUS (from AIM, FIA, LANDFIRE, and RAP), as well as climate variables from DayMet, which have been aggregated into mean interannual conditions accross multiple temporal windows.

Dependencies

Set user defined parameters

print(params)
## $run
## [1] FALSE
## 
## $save_figs
## [1] TRUE
## 
## $ecoregion
## [1] "CONUS"
## 
## $response
## [1] "TotalTreeCover"
## 
## $treeThreshold
## [1] 10
## 
## $removeTexasLouisianaPlain
## [1] FALSE
## 
## $whichSecondBestMod
## [1] "auto"
## 
## $thresholdMethod
## [1] "MaxKappa"
# set to true if want to run for a limited number of rows (i.e. for code testing)
test_run <- params$test_run
save_figs <- params$save_figs
response <- params$response
fit_sample <- TRUE # fit model to a sample of the data
n_train <- 5e4 # sample size of the training data
n_test <- 1e6 # sample size of the testing data (if this is too big the decile dotplot code throws memory errors)
removeTLP <- params$removeTexasLouisianaPlain
run <- params$run
whichSecondBestMod <- params$whichSecondBestMod
treeThreshold <- params$treeThreshold
thresholdMethod <- params$thresholdMethod

Load packages and source functions

# set option so resampled dataset created here reproduces earlier runs of this code with dplyr 1.0.10
source("../../../Functions/glmTransformsIterates.R")
source("../../../Functions/transformPreds.R")
source("../../../Functions/betaLASSO.R")

#source("../../../Functions/StepBeta_mine.R")
#source("src/fig_params.R")
#source("src/modeling_functions.R")
 
library(betareg)
library(ggspatial)
library(terra)
library(tidyterra)
library(sf)
library(caret)
library(tidyverse)
library(GGally) # for ggpairs()
library(pdp) # for partial dependence plots
library(gridExtra)
library(knitr)
library(patchwork) # for figure insets etc. 
library(ggtext)
library(StepBeta)
theme_set(theme_classic())
library(here)
library(rsample)
library(kableExtra)
library(glmnet)
library(USA.state.boundaries)
library(cvms)
library(rsvg)
library(ggimage)
library(doMC)
library(parallel)
library(pROC)

Read in data

Data compiled in the prepDataForModels.R script

here::i_am("Analysis/VegComposition/ModelFitting/02_ModelFitting_globalTreeModel.Rmd")
modDat <- readRDS( here("Data_processed", "CoverData", "DataForModels_spatiallyAveraged_withSoils_noSf_sampledLANDFIRE.rds")) %>% st_drop_geometry()

Prep data

We will fit a binomial model that predicts whether or not there are trees at a location. Because the tree cover data we have is continuous between 0 and 100, we convert it to be binomial be forcing any values ≤ % to be 0, and any values > % to be 1.

modDat <- modDat %>% 
  mutate(TotalTreeCover_binom = replace(TotalTreeCover, TotalTreeCover <=treeThreshold, 0)) %>% 
  mutate(TotalTreeCover_binom = replace(TotalTreeCover_binom, TotalTreeCover_binom > treeThreshold, 1))
set.seed(1234)
# now, rename columns for brevity
modDat_1 <- modDat %>% 
  dplyr::select(-c(prcp_annTotal:annVPD_min)) %>% 
  # mutate(Lon = st_coordinates(.)[,1], 
  #        Lat = st_coordinates(.)[,2])  %>% 
  # st_drop_geometry() %>% 
  # filter(!is.na(newRegion))
  rename("tmin" = tmin_meanAnnAvg_CLIM, 
     "tmax" = tmax_meanAnnAvg_CLIM, #1
     "tmean" = tmean_meanAnnAvg_CLIM, 
     "prcp" = prcp_meanAnnTotal_CLIM, 
     "t_warm" = T_warmestMonth_meanAnnAvg_CLIM,
     "t_cold" = T_coldestMonth_meanAnnAvg_CLIM, 
     "prcp_wet" = precip_wettestMonth_meanAnnAvg_CLIM,
     "prcp_dry" = precip_driestMonth_meanAnnAvg_CLIM, 
     "prcp_seasonality" = precip_Seasonality_meanAnnAvg_CLIM, #2
     "prcpTempCorr" = PrecipTempCorr_meanAnnAvg_CLIM,  #3
     "abvFreezingMonth" = aboveFreezing_month_meanAnnAvg_CLIM, 
     "isothermality" = isothermality_meanAnnAvg_CLIM, #4
     "annWatDef" = annWaterDeficit_meanAnnAvg_CLIM, 
     "annWetDegDays" = annWetDegDays_meanAnnAvg_CLIM,
     "VPD_mean" = annVPD_mean_meanAnnAvg_CLIM, 
     "VPD_max" = annVPD_max_meanAnnAvg_CLIM, #5
     "VPD_min" = annVPD_min_meanAnnAvg_CLIM, #6
     "VPD_max_95" = annVPD_max_95percentile_CLIM, 
     "annWatDef_95" = annWaterDeficit_95percentile_CLIM, 
     "annWetDegDays_5" = annWetDegDays_5percentile_CLIM, 
     "frostFreeDays_5" = durationFrostFreeDays_5percentile_CLIM, 
     "frostFreeDays" = durationFrostFreeDays_meanAnnAvg_CLIM, 
     "soilDepth" = soilDepth, #7
     "clay" = surfaceClay_perc, 
     "sand" = avgSandPerc_acrossDepth, #8
     "coarse" = avgCoarsePerc_acrossDepth, #9
     "carbon" = avgOrganicCarbonPerc_0_3cm, #10
     "AWHC" = totalAvailableWaterHoldingCapacity,
     ## anomaly variables
     tmean_anom = tmean_meanAnnAvg_3yrAnom, #15
     tmin_anom = tmin_meanAnnAvg_3yrAnom, #16
     tmax_anom = tmax_meanAnnAvg_3yrAnom, #17
    prcp_anom = prcp_meanAnnTotal_3yrAnom, #18
      t_warm_anom = T_warmestMonth_meanAnnAvg_3yrAnom,  #19
     t_cold_anom = T_coldestMonth_meanAnnAvg_3yrAnom, #20
      prcp_wet_anom = precip_wettestMonth_meanAnnAvg_3yrAnom, #21
      precp_dry_anom = precip_driestMonth_meanAnnAvg_3yrAnom,  #22
    prcp_seasonality_anom = precip_Seasonality_meanAnnAvg_3yrAnom, #23 
     prcpTempCorr_anom = PrecipTempCorr_meanAnnAvg_3yrAnom, #24
      aboveFreezingMonth_anom = aboveFreezing_month_meanAnnAvg_3yrAnom, #25  
    isothermality_anom = isothermality_meanAnnAvg_3yrAnom, #26
       annWatDef_anom = annWaterDeficit_meanAnnAvg_3yrAnom, #27
     annWetDegDays_anom = annWetDegDays_meanAnnAvg_3yrAnom,  #28
      VPD_mean_anom = annVPD_mean_meanAnnAvg_3yrAnom, #29
      VPD_min_anom = annVPD_min_meanAnnAvg_3yrAnom,  #30
      VPD_max_anom = annVPD_max_meanAnnAvg_3yrAnom,  #31
     VPD_max_95_anom = annVPD_max_95percentile_3yrAnom, #32
      annWatDef_95_anom = annWaterDeficit_95percentile_3yrAnom, #33 
      annWetDegDays_5_anom = annWetDegDays_5percentile_3yrAnom ,  #34
    frostFreeDays_5_anom = durationFrostFreeDays_5percentile_3yrAnom, #35 
      frostFreeDays_anom = durationFrostFreeDays_meanAnnAvg_3yrAnom #36
  ) %>% 
  dplyr::select(-c(tmin_meanAnnAvg_3yr:durationFrostFreeDays_meanAnnAvg_3yr))

Predictors

The following are the candidate predictor variables for this ecoregion:

  prednames <- c(
      "tmean",
      "prcp",
      "prcp_dry", 
      "prcp_seasonality",
      "prcpTempCorr", 
      "isothermality", 
      "annWetDegDays",
      "annWatDef_95", 
      "clay",
      "coarse",
      "AWHC" 
 
 #"tmean", "prcp", "prcp_seasonality", "prcpTempCorr", "isothermality", "annWetDegDays", "sand", "coarse", "AWHC"
  )

print(prednames)
##  [1] "tmean"            "prcp"             "prcp_dry"         "prcp_seasonality"
##  [5] "prcpTempCorr"     "isothermality"    "annWetDegDays"    "annWatDef_95"    
##  [9] "clay"             "coarse"           "AWHC"

Scale the predictor variables for the model-fitting process

allPreds <- modDat_1 %>% 
  dplyr::select(tmin:frostFreeDays,tmean_anom:frostFreeDays_anom, soilDepth:AWHC) %>% 
  names()
modDat_1_s <- modDat_1 %>% 
  mutate(across(all_of(allPreds), base::scale, .names = "{.col}_s")) 
saveRDS(modDat_1_s, file = "./models/scaledModelInputData.rds")

# Remove the rows that have no observations for total tree cover
modDat_1_s <- modDat_1_s[!is.na(modDat_1_s[,"TotalTreeCover_binom"]),]

# subset the data to only include these predictors, and remove any remaining NAs 
modDat_1_s <- modDat_1_s %>% 
  dplyr::select(prednames, paste0(prednames, "_s"), TotalTreeCover, TotalTreeCover_binom, newRegion, Year, x, y, NA_L1NAME, NA_L2NAME) %>% 
  drop_na()

names(prednames) <- prednames
df_pred <- modDat_1_s[, prednames]

response <- "TotalTreeCover"

Visualize the response variable

ggplot(modDat_1_s) + 
  geom_histogram(aes(TotalTreeCover/100), fill = "darkgreen", col = "darkgreen", alpha = .5) + 
  xlab("Tree Cover") + 
  ggtitle("Untransformed, observed tree cover")

ggplot(modDat_1_s) + 
  geom_histogram(aes(TotalTreeCover_binom), fill = "purple", alpha = .5, col = "purple") +
  xlab("Tree Cover") + 
  ggtitle(paste0("Tree cover, converted to binomial with a ",treeThreshold,"cutoff"))

create_summary <- function(df) {
  df %>% 
    pivot_longer(cols = everything(),
                 names_to = 'variable') %>% 
    group_by(variable) %>% 
    summarise(across(value, .fns = list(mean = ~mean(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE), 
                                        median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE)))) %>% 
    mutate(across(where(is.numeric), round, 4))
}

modDat_1_s[prednames] %>% 
  create_summary() %>% 
  knitr::kable(caption = 'summaries of possible predictor variables') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
summaries of possible predictor variables
variable value_mean value_min value_median value_max
AWHC 14.8606 0.1424 14.2535 35.2881
annWatDef_95 164.3643 0.0000 135.6146 601.2686
annWetDegDays 1854.7754 81.2108 1608.8952 7131.5166
clay 19.7378 0.1687 18.6720 83.0975
coarse 9.3788 0.0000 5.4986 79.9649
isothermality 37.6851 19.4935 37.7091 63.7425
prcp 483.0393 47.6797 399.3619 4360.3490
prcpTempCorr 0.0474 -0.8613 0.1239 0.7098
prcp_dry 3.2466 0.0000 1.6926 81.4306
prcp_seasonality 0.9725 0.3568 0.9379 2.2319
tmean 11.1504 -2.2524 10.0136 24.6330

Histograms of raw and scaled predictors

scaleFigDat_1 <- modDat_1_s %>% 
  dplyr::select(c(x, y, Year, prednames)) %>% 
  pivot_longer(cols = all_of(names(prednames)), 
               names_to = "predNames", 
               values_to = "predValues_unScaled")
scaleFigDat_2 <- modDat_1_s %>% 
  dplyr::select(c(x, y, Year, paste0(prednames, "_s"))) %>% 
  pivot_longer(cols = all_of(paste0(prednames,"_s"
                                    )), 
               names_to = "predNames", 
               values_to = "predValues_scaled", 
               names_sep = ) %>% 
  mutate(predNames = str_replace(predNames, pattern = "_s$", replacement = ""))

scaleFigDat_3 <- scaleFigDat_1 %>% 
  left_join(scaleFigDat_2)

ggplot(scaleFigDat_3) + 
  facet_wrap(~predNames, scales = "free") +
  geom_histogram(aes(predValues_unScaled), fill = "lightgrey", col = "darkgrey") + 
  geom_histogram(aes(predValues_scaled), fill = "lightblue", col = "blue") +
  xlab ("predictor variable values") + 
  ggtitle("Comparing the distribution of unscaled (grey) to scaled (blue) predictor variables")

modDat_1_s <- modDat_1_s %>% 
  rename_with(~paste0(.x, "_raw"), 
                any_of(names(prednames))) %>% 
  rename_with(~str_remove(.x, "_s$"), 
              any_of(paste0(names(prednames), "_s")))

Predictor variables compared to binned response variables

set.seed(12011993)
# vector of name of response variables
vars_response <- response
# longformat dataframes for making boxplots
df_sample_plots <-  modDat_1_s  %>% 
  slice_sample(n = 5e4) %>% 
   rename(response = all_of("TotalTreeCover_binom")) %>% 
  mutate(response = case_when(
    response == 0 ~ "0", 
    response > 0  ~ "1", 
  )) %>% 
  dplyr::select(c(response, prednames)) %>% 
  tidyr::pivot_longer(cols = unname(prednames), 
               names_to = "predictor", 
               values_to = "value"
               )  
 

  ggplot(df_sample_plots, aes_string(x= "response", y = 'value')) +
  geom_boxplot() +
  facet_wrap(~predictor , scales = 'free_y') + 
  ylab("Predictor Variable Values") + 
    xlab(response)

If desired, split the data into training and test data (just leave one year out for now–somewhat randomly chose 2019, 2003, and 2012 )

# # # do a pca of climate across years 
# pca_yrs <- prcomp(modDat_1_s[,paste0(prednames)])
# # library(factoextra)
# (fviz_pca_ind(pca_yrs, habillage = modDat_1_s$Year, label = "none", addEllipses = TRUE, ellipse.level = .95, ggtheme = theme_minimal(), alpha.ind = .1))
# 
# ggplot(modDat_1_s) + 
#   geom_point(aes(modDat_1_s$Year, modDat_1_s$TotalTreeCover)) + 
#   geom_smooth(aes(modDat_1_s$Year, modDat_1_s$TotalTreeCover))

# save test set
modDat_1_sTest <- modDat_1_s[modDat_1_s$Year %in% c(2003, 2012, 2019),]
modDat_1_s <- modDat_1_s[!modDat_1_s$Year %in% c(2003, 2012, 2019),]

Model Fitting

Visualize the spatial blocks and how they differ across environmental space

First, if there are observations in Louisiana, sub-sample them so they’re not so over-represented in the dataset

## make data into spatial format
modDat_1_sf <- modDat_1_s %>% 
  st_as_sf(coords = c("x", "y"), crs = st_crs("EPSG:4326"))


# download map info for visualization
data(state_boundaries_wgs84) 

cropped_states <- suppressMessages(state_boundaries_wgs84 %>%
  dplyr::filter(NAME!="Hawaii") %>%
  dplyr::filter(NAME!="Alaska") %>%
  dplyr::filter(NAME!="Puerto Rico") %>%
  dplyr::filter(NAME!="American Samoa") %>%
  dplyr::filter(NAME!="Guam") %>%
  dplyr::filter(NAME!="Commonwealth of the Northern Mariana Islands") %>%
  dplyr::filter(NAME!="United States Virgin Islands") %>%
  sf::st_sf() %>%
  sf::st_transform(sf::st_crs(modDat_1_sf))) 
## do a pca of climate across level 2 ecoregions
pca <- prcomp(modDat_1_s[,paste0(prednames)])
library(factoextra)
(fviz_pca_ind(pca, habillage = modDat_1_s$NA_L2NAME, label = "none", addEllipses = TRUE, ellipse.level = .95, ggtheme = theme_minimal(), alpha.ind = .1))

# make a table of n for each region
modDat_1_s %>% 
  group_by(NA_L2NAME) %>% 
  dplyr::summarize("Number_Of_Observations" = length(NA_L2NAME)) %>% 
  rename("Level_2_Ecoregion" = NA_L2NAME)%>% 
  kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
Level_2_Ecoregion Number_Of_Observations
ATLANTIC HIGHLANDS 827
CENTRAL USA PLAINS 71
COLD DESERTS 144290
EVERGLADES 2
MARINE WEST COAST FOREST 2266
MEDITERRANEAN CALIFORNIA 11356
MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS 1445
MIXED WOOD PLAINS 1179
MIXED WOOD SHIELD 1014
OZARK/OUACHITA-APPALACHIAN FORESTS 1993
SOUTH CENTRAL SEMIARID PRAIRIES 96157
SOUTHEASTERN USA PLAINS 3816
TAMAULIPAS-TEXAS SEMIARID PLAIN 6476
TEMPERATE PRAIRIES 12890
TEXAS-LOUISIANA COASTAL PLAIN 3888
UPPER GILA MOUNTAINS 6949
WARM DESERTS 58351
WEST-CENTRAL SEMIARID PRAIRIES 75472
WESTERN CORDILLERA 35546
WESTERN SIERRA MADRE PIEDMONT 5975

Then, look at the spatial distribution and environmental characteristics of the grouped ecoregions

map1 <- ggplot() +
  geom_sf(data=cropped_states,fill='white') +
  geom_sf(data=modDat_1_sf#[modDat_1_sf$NA_L2NAME %in% c("MIXED WOOD PLAINS"),]
          ,
          aes(fill=as.factor(NA_L2NAME)),linewidth=0.5,alpha=0.5) +
  geom_point(data=modDat_1_s#[modDat_1_s$NA_L2NAME %in% c("MIXED WOOD PLAINS"),]
             ,
             alpha=0.5, 
             aes(x = x, y = y, color=as.factor(NA_L2NAME)), alpha = .1) +
  #scale_fill_okabeito() +
  #scale_color_okabeito() +
 # theme_default() +
  theme(legend.position = 'none') +
  labs(title = "Level 2 Ecoregions as spatial blocks")

hull <- modDat_1_sf %>%
  ungroup() %>%
  group_by(NA_L2NAME) %>%
  slice(chull(tmean, prcp))

plot1<-ggplot(data=modDat_1_sf,aes(x=tmean,y=prcp)) +
  geom_polygon(data = hull, alpha = 0.25,aes(fill=NA_L2NAME) )+
  geom_point(aes(group=NA_L2NAME,color=NA_L2NAME),alpha=0.25) +
  theme_minimal() + xlab("Annual Average T_mean - long-term average") +
  ylab("Annual Average Precip - long-term average") #+
  #scale_color_okabeito() +
  #scale_fill_okabeito()

plot2<-ggplot(data=modDat_1_sf %>%
                pivot_longer(cols=tmean:prcp),
              aes(x=value,group=name)) +
  # geom_polygon(data = hull, alpha = 0.25,aes(fill=fold) )+
  geom_density(aes(group=NA_L2NAME,fill=NA_L2NAME),alpha=0.25) +
  theme_minimal() +
  facet_wrap(~name,scales='free')# +
  #scale_color_okabeito() +
  #scale_fill_okabeito()
 
library(patchwork)
(combo <- (map1+plot1)/plot2) 

# 
# ggplot(data = modDat_1_s) +
#   geom_density(aes(ShrubCover_transformed, col = NA_L2NAME)) +
#   xlim(c(0,100))

Fit a global model with all of the data

First, fit a LASSO regression model using the glmnet R package

  • This regression is a beta glm with a logit link (using custom function from Daniel)
  • Use cross validation across level 2 ecoregions to tune the lambda parameter in the LASSO model
  • this model is fit to using the scaled weather/climate/soils variables
  • this list of possible predictors includes:
    1. main effects
    2. interactions between all soils variables
    3. interactions between climate and weather variables
    4. transformed main effects (squared, log-transformed (add a uniform integer – 20– to all variables prior to log-transformation), square root-transformed (add a uniform integer – 20– to all variables prior to log-transformation))

Get rid of transformed predictions and interactions that are correlated

# get first pass of names correlated variables
X_df <- X %>% 
  as.data.frame() %>% 
  dplyr::select(-'(Intercept)')  
corrNames_i <- X_df %>% 
  cor()  %>% 
   caret::findCorrelation(cutoff = .7, verbose = FALSE, names = TRUE, exact = TRUE)
# remove those names that are untransformed main effects 
  # vector of columns to remove 
badNames <- corrNames_i[!(corrNames_i %in% prednames)]
while (sum(!(corrNames_i %in% prednames))>0) {
 corrNames_i <-  X_df %>% 
    dplyr::select(-badNames) %>% 
     cor()  %>% 
   caret::findCorrelation(cutoff = .7, verbose = FALSE, names = TRUE, exact = TRUE)
 # update the vector of names to remove 
 badNames <- unique(c(badNames, corrNames_i[!(corrNames_i %in% prednames)]))
}

## see if there are any correlated variables left (would be all main effects at this point)
# if there are, step through and remove the variable that each is most correlated with 
if (length(corrNames_i)>1) {
  for (i in 1:length(corrNames_i)) {
    X_i <- X_df %>% 
      dplyr::select(-badNames)
    if (corrNames_i[i] %in% names(X_i)) {
    corMat_i <- cor(x = X_i[corrNames_i[i]], y = X_i %>% dplyr::select(-corrNames_i[i])) 
    badNames_i <- colnames(corMat_i)[abs(corMat_i)>=.7]
    # if there are any predictors in the 'badNames_i', remove them from this list
    if (length(badNames_i) > 0 & sum(c(badNames_i %in% prednames))>0) {
        badNames_i <- badNames_i[!(badNames_i %in% prednames)]
    }
    badNames <- unique(c(badNames, badNames_i))
    }
  }
}
## update the X matrix to exclude these correlated variables
X <- X[,!(colnames(X) %in% badNames)]

Run the global LASSO model

if (run == TRUE) {
  # set up custom folds
    # get the ecoregions for training lambda
  train_eco <- modDat_1_s$NA_L2NAME#[train]
  
  # Fit model -----------------------------------------------
  # specify leave-one-year-out cross-validation
  my_folds <- as.numeric(as.factor(train_eco))

    # set up parallel processing
    # this computer has 16 cores (parallel::detectCores())
    registerDoMC(cores = 8)
    
    fit <- cv.glmnet(
      x = X[,2:ncol(X)], 
      y = y, 
      family = "binomial",
      keep = FALSE,
      alpha = 1,  # 0 == ridge regression, 1 == lasso, 0.5 ~~ elastic net
      lambda = lambdas,
      relax = ifelse(response == "ShrubCover", yes = TRUE, no = FALSE),
      #nlambda = 100,
      type.measure="mse",
      #penalty.factor = pen_facts,
      foldid = my_folds,
      #thresh = thresh,
      standardize = FALSE, ## scales variables prior to the model sequence... coefficients are always returned on the original scale
      parallel = TRUE
    )
    base::saveRDS(fit, paste0("../ModelFitting/models/yesOrNoTrees_globalLASSOmod_binomial_treeCutoff_",treeThreshold,".rds"))
    
  
     best_lambda <- fit$lambda.min
  # save the lambda for the most regularized model w/ an MSE that is still 1SE w/in the best lambda model
  lambda_1SE <- fit$lambda.1se
  # save the lambda for the most regularized model w/ an MSE that is still .5SE w/in the best lambda model
  lambda_halfSE <- best_lambda + ((lambda_1SE - best_lambda)/2)
 
## Now, we need to do stability selection to ensure the coefficients that are being chosen with each lambda are stable 

## stability selection for best lambda model 
# setup params
p <- ncol(X[,2:ncol(X)]) # of parameters
n <- length(y) # of observations
n_iter <- 100        # number of subsamples
sample_frac <- 0.75  # fraction of data to subsample
lambda_val <- best_lambda    # fixed lambda value (could be chosen via CV)

# Track selection
selection_counts_bestL <- matrix(0, nrow = p, ncol = 1)

for (i in 1:n_iter) {
  # Subsample rows
  sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
  X_sub <- X[sample_idx, ]
  y_sub <- y[sample_idx]

  # Fit Lasso model
  fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub, 
    family = "binomial",
    alpha = 1, lambda = lambda_val, standardize = FALSE)

  # Get non-zero coefficients (excluding intercept)
  select_bestL <- which(as.vector(coef(fit_stab_i))[-1] != 0)
  selection_counts_bestL[select_bestL] <- selection_counts_bestL[select_bestL] + 1
}

# Convert counts to selection probabilities (the probability that a variable is selected over 100 iterations)
selection_prob_bestL <- selection_counts_bestL / n_iter
selection_prob_bestL_df <- data.frame(
  VariableNumber = paste0("X", 1:p),
  VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
  SelectionProb = as.vector(selection_prob_bestL)
)

# get those variables that are selected in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
bestLambda_coef <- selection_prob_bestL_df[selection_prob_bestL_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]

#//////
# stability selection for 1se lambda model
lambda_val <-  lambda_1SE    # fixed lambda value (could be chosen via CV)

# Track selection
selection_counts_1seL <- matrix(0, nrow = p, ncol = 1)

for (i in 1:n_iter) {
  # Subsample rows
  sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
  X_sub <- X[sample_idx, ]
  y_sub <- y[sample_idx]

  # Fit Lasso model
  fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub, 
    family = "binomial",
    alpha = 1, lambda = lambda_val, standardize = FALSE)

  # Get non-zero coefficients (excluding intercept)
  selected_1seL <- which(as.vector(coef(fit_stab_i))[-1] != 0)
  selection_counts_1seL[selected_1seL] <- selection_counts_1seL[selected_1seL] + 1
}

# Convert counts to selection probabilities (the probability that a variable is selected over 100 iterations)
selection_prob_1seL <- selection_counts_1seL / n_iter
selection_prob_1seL_df <- data.frame(
  VariableNumber = paste0("X", 1:p),
  VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
  SelectionProb = as.vector(selection_prob_1seL)
)

# get those variables that are selected in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
seLambda_coef <- selection_prob_1seL_df[selection_prob_1seL_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]

# stability selection for half se lambda model
lambda_val <- lambda_halfSE    # fixed lambda value (could be chosen via CV)

# Track selection
selection_counts_halfseL <- matrix(0, nrow = p, ncol = 1)

for (i in 1:n_iter) {
  # Subsample rows
  sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
  X_sub <- X[sample_idx, ]
  y_sub <- y[sample_idx]

  # Fit Lasso model
  fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub, 
    family = "binomial",
    alpha = 1, lambda = lambda_val, standardize = FALSE)

  # Get non-zero coefficients (excluding intercept)
  selected_halfseL <- which(as.vector(coef(fit_stab_i))[-1] != 0)
  selection_counts_halfseL[selected_halfseL] <- selection_counts_halfseL[selected_halfseL] + 1
}

# Convert counts to selection probabilities (the probability that a variable is selected_halfseL over 100 iterations)
selection_prob_halfseL <- selection_counts_halfseL / n_iter
selection_prob_halfseL_df <- data.frame(
  VariableNumber = paste0("X", 1:p),
  VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
  SelectionProb = as.vector(selection_prob_halfseL)
)

# get those variables that are selected_halfseL_halfseL in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
halfseLambda_coef <- selection_prob_halfseL_df[selection_prob_halfseL_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]

## fit w/ the identified coefficients from the 'best' lambda, but using the glm function
  mat_glmnet_best <- bestLambda_coef$VariableName 

  if (length(mat_glmnet_best) == 0) {
    f_glm_bestLambda <- as.formula(paste0("TotalTreeCover_binom", "~ 1"))
  } else {
  f_glm_bestLambda <- as.formula(paste0("TotalTreeCover_binom", " ~ ", paste0(mat_glmnet_best, collapse = " + ")))
  }
  
## fit using betareg
  fit_glm_bestLambda <- fit_glm_bestLambda_binomial <- glm(formula = f_glm_bestLambda, data = modDat_1_s, family = binomial)
    
   ## fit w/ the identified coefficients from the '1se' lambda, but using the glm function
  mat_glmnet_1se <- seLambda_coef$VariableName
    
  if (length(mat_glmnet_1se) == 0) {
    f_glm_1se <- as.formula(paste0("TotalTreeCover_binom", "~ 1"))
  } else {
  f_glm_1se <- as.formula(paste0("TotalTreeCover_binom", " ~ ", paste0(mat_glmnet_1se, collapse = " + ")))
  }


  fit_glm_se <- glm(formula = f_glm_1se, data = modDat_1_s, family = binomial)
  # glm(data = modDat_1_s, formula = f_glm_1se,
  #                   family =  stats::Gamma(link = "log"))
  
     ## fit w/ the identified coefficients from the '.5se' lambda, but using the glm function
  mat_glmnet_halfse <- halfseLambda_coef$VariableName
  
  if (length(mat_glmnet_halfse) == 0) {
    f_glm_halfse <- as.formula(paste0("TotalTreeCover_binom", "~ 1"))
  } else {
  f_glm_halfse <- as.formula(paste0("TotalTreeCover_binom", " ~ ", paste0(mat_glmnet_halfse, collapse = " + ")))
  }

  fit_glm_halfse <- glm(formula = f_glm_halfse, data = modDat_1_s, family = binomial )
  
  ## save models 
  saveRDS(fit_glm_bestLambda, paste0("./models/yesOrNoTrees_bestLambdaGLM_",treeThreshold,".rds"))
  saveRDS(fit_glm_halfse, paste0("./models/yesOrNoTrees_halfSELambdaGLM_",treeThreshold,".rds"))
  saveRDS(fit_glm_se, paste0("./models/yesOrNoTrees_oneSELambdaGLM_",treeThreshold,".rds"))
    
  ## save the R environment after running the models 
  save(f_glm_halfse, mat_glmnet_halfse, halfseLambda_coef,
              f_glm_1se, mat_glmnet_1se, seLambda_coef,
              f_glm_bestLambda, mat_glmnet_best, bestLambda_coef,
              file = paste0("./models/interimModelFittingObjects_yesOrNoTrees_binomial_",treeThreshold,".rds"))
  } else {
    # read in LASSO object
    fit <- readRDS(paste0("../ModelFitting/models/yesOrNoTrees_globalLASSOmod_binomial_treeCutoff_",treeThreshold,".rds"))
    
    # read in R objects having to do w/ model fitting 
    load(file = paste0("./models/interimModelFittingObjects_yesOrNoTrees_binomial_",treeThreshold,".rds"))
      
  fit_glm_bestLambda <- readRDS(paste0("./models/yesOrNoTrees_bestLambdaGLM_",treeThreshold,".rds"))
  fit_glm_halfse <- readRDS(paste0("./models/yesOrNoTrees_halfSELambdaGLM_",treeThreshold,".rds"))
  fit_glm_se <- readRDS(paste0("./models/yesOrNoTrees_oneSELambdaGLM_",treeThreshold,".rds"))
  }


  # assess model fit
  # assess.glmnet(fit$fit.preval, #newx = X[,2:293], 
  #               newy = y, family = stats::Gamma(link = "log"))
  # save the minimum lambda
  best_lambda <- fit$lambda.min
  # save the lambda for the most regularized model w/ an MSE that is still 1SE w/in the best lambda model
  lambda_1SE <- fit$lambda.1se
  # save the lambda for the most regularized model w/ an MSE that is still .5SE w/in the best lambda model
  lambda_halfSE <- best_lambda + ((lambda_1SE - best_lambda)/2)
 
  print(fit)     
## 
## Call:  cv.glmnet(x = X[, 2:ncol(X)], y = y, lambda = lambdas, type.measure = "mse",      foldid = my_folds, keep = FALSE, parallel = TRUE, relax = ifelse(response ==          "ShrubCover", yes = TRUE, no = FALSE), family = "binomial",      alpha = 1, standardize = FALSE) 
## 
## Measure: Mean-Squared Error 
## 
##      Lambda Index Measure      SE Nonzero
## min 0.00365   115  0.2447 0.03089      22
## 1se 0.04150    80  0.2736 0.02847       5
  plot(fit)

Then, we predict (on the training set) using both of these models (best lambda and 1se lambda) as an itial test

  ## predict on the test data
  # lasso model predictions with the optimal lambda
  optimal_pred <- predict(fit_glm_bestLambda, newx=X[,2:ncol(X)], type = "response")
  optimal_pred_1se <-  predict(fit_glm_se, newx=X[,2:ncol(X)], type = "response")
  optimal_pred_halfse <- predict(fit_glm_halfse, newx = X[,2:ncol(X)], type = "response")
  
    null_fit <- glm(
      formula = y ~ 1, #data = modDat_1_s, 
      family = binomial
      )
  null_pred <- predict(null_fit, newdata = as.data.frame(X), type = "response"
                       )

  # save data
  fullModOut <- list(
    "modelObject" = fit,
    "nullModelObject" = null_fit,
    "modelPredictions" = data.frame(#ecoRegion_holdout = rep(test_eco,length(y)),
      obs=y,
                    pred_opt=optimal_pred, 
                    pred_opt_se = optimal_pred_1se,
                    pred_opt_halfse = optimal_pred_halfse,
                    pred_null=null_pred#,
                    #pred_nopenalty=nopen_pred
                    ))

ggplot() + 
  geom_point(aes(X[,2], fullModOut$modelPredictions$obs), col = "black", alpha = .1) + 
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt), col = "red", alpha = .1) + ## predictions w/ the CV model
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt_halfse), col = "orange", alpha = .1) + ## predictions w/ the CV model (.5se lambda)
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt_se), col = "green", alpha = .1) + ## predictions w/ the CV model (1se lambda)
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_null), col = "blue", alpha = .1) + 
  labs(title = "A rough comparison of observed and model-predicted values", 
       subtitle = "black = observed values \n red = predictions from 'best lambda' model \n orange = predictions for '1/2se' lambda model \n green = predictions from '1se' lambda model \n blue = predictions from null model") +
  xlab(colnames(X)[2])

  #ylim(c(0,200))

The internal cross-validation process to fit the global LASSO model identified an optimal lambda value (regularization parameter) of r{print(best_lambda)}. The lambda value such that the cross validation error is within 1 standard error of the minimum (“1se lambda”) was `r{print(fit$lambda.1se)}`` . The following coefficients were kept in each model:

# the coefficient matrix from the 'best model' -- find and print those coefficients that aren't 0 in a table
coef_glm_bestLambda <- coef(fit_glm_bestLambda) %>% 
  data.frame() 
coef_glm_bestLambda$coefficientName <- rownames(coef_glm_bestLambda)
names(coef_glm_bestLambda)[1] <- "coefficientValue_bestLambda"
# coefficient matrix from the '1se' model 
coef_glm_1se <- coef(fit_glm_se) %>% 
  data.frame() 
coef_glm_1se$coefficientName <- rownames(coef_glm_1se)
names(coef_glm_1se)[1] <- "coefficientValue_1seLambda"
# coefficient matrix from the 'half se' model 
coef_glm_halfse <- coef(fit_glm_halfse) %>% 
  data.frame() 
coef_glm_halfse$coefficientName <- rownames(coef_glm_halfse)
names(coef_glm_halfse)[1] <- "coefficientValue_halfseLambda"
# add together
coefs <- full_join(coef_glm_bestLambda, coef_glm_halfse) %>% 
  full_join(coef_glm_1se) %>% 
  dplyr::select(coefficientName, coefficientValue_bestLambda,
                coefficientValue_halfseLambda, coefficientValue_1seLambda)

globModTerms <- coefs[!is.na(coefs$coefficientValue_bestLambda), "coefficientName"]

## also, get the number of unique variables in each model 
var_prop_pred <- paste0(response, "_pred")
response_vars <- c(response, var_prop_pred)
# for best lambda model
prednames_fig <- paste(str_split(globModTerms, ":", simplify = TRUE)) 
prednames_fig <- str_replace(prednames_fig, "I\\(", "")
prednames_fig <- str_replace(prednames_fig, "\\^2\\)", "")
prednames_fig <- unique(prednames_fig[prednames_fig>0])
prednames_fig <- prednames_fig
prednames_fig_num <- length(prednames_fig)
# for 1SE lambda model
globModTerms_1se <- coefs[!is.na(coefs$coefficientValue_1seLambda), "coefficientName"]
if (length(globModTerms_1se) == 1) {
prednames_fig_1se <- paste(str_split(globModTerms_1se, ":", simplify = TRUE)) 
prednames_fig_1se <- str_replace(prednames_fig_1se, "I\\(", "")
prednames_fig_1se <- str_replace(prednames_fig_1se, "\\^2\\)", "")
prednames_fig_1se <- unique(prednames_fig_1se[prednames_fig_1se>0])
prednames_fig_1se_num <- c(0)
} else {
prednames_fig_1se <- paste(str_split(globModTerms_1se, ":", simplify = TRUE)) 
prednames_fig_1se <- str_replace(prednames_fig_1se, "I\\(", "")
prednames_fig_1se <- str_replace(prednames_fig_1se, "\\^2\\)", "")
prednames_fig_1se <- unique(prednames_fig_1se[prednames_fig_1se>0])
prednames_fig_1se_num <- length(prednames_fig_1se)
}
# for 1/2SE lambda model
globModTerms_halfse <- coefs[!is.na(coefs$coefficientValue_halfseLambda), "coefficientName"]
if (length(globModTerms_halfse) == 1) {
prednames_fig_halfse <- paste(str_split(globModTerms_halfse, ":", simplify = TRUE)) 
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "I\\(", "")
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "\\^2\\)", "")
prednames_fig_halfse <- unique(prednames_fig_halfse[prednames_fig_halfse>0])
prednames_fig_halfse_num <- c(0)
} else {
prednames_fig_halfse <- paste(str_split(globModTerms_halfse, ":", simplify = TRUE)) 
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "I\\(", "")
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "\\^2\\)", "")
prednames_fig_halfse <- unique(prednames_fig_halfse[prednames_fig_halfse>0])
prednames_fig_halfse_num <- length(prednames_fig_halfse)
}
# make a table
kable(coefs, col.names = c("Coefficient Name", "Value from best lambda model", 
                           "Value from 1/2 se lambda", "Value from 1se lambda model")
      ) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
Coefficient Name Value from best lambda model Value from 1/2 se lambda Value from 1se lambda model
(Intercept) -0.6747817 -0.8616112 -1.3908996
prcp 1.7134506 1.9694935 1.8591155
prcp_seasonality -0.0163393 -0.3086721 -0.5178721
prcpTempCorr -0.9128524 -0.5967287 -0.2352708
annWetDegDays 0.5337744 NA NA
annWatDef_95 -0.6052394 NA NA
coarse 0.2057888 NA NA
AWHC -0.6156823 -0.7655963 -0.8858988
I(tmean^2) -0.3181863 NA NA
I(prcp_seasonality^2) 0.2010413 NA NA
I(prcpTempCorr^2) -0.8905565 -0.6416890 NA
I(isothermality^2) 0.1422692 NA NA
I(clay^2) -0.1181021 -0.1661580 -0.1421531
I(AWHC^2) 0.1127803 NA NA
annWetDegDays:annWatDef_95 0.9165646 NA NA
prcp_dry:isothermality -0.2965084 -0.4376182 NA
prcpTempCorr:isothermality -0.0206105 NA NA
isothermality:tmean -0.2850425 NA NA
prcpTempCorr:prcp_dry 0.5054909 NA NA
prcpTempCorr:tmean -0.2528563 NA NA
AWHC:clay -0.1629028 -0.2506869 NA
coarse:clay 0.0916325 NA NA
# calculate RMSE of all models 
RMSE_best <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt")], truth = "obs", estimate = "pred_opt")$.estimate
RMSE_halfse <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt_halfse")], truth = "obs", estimate = "pred_opt_halfse")$.estimate
RMSE_1se <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt_se")], truth = "obs", estimate = "pred_opt_se")$.estimate
# calculate bias of all models
bias_best <-  mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt)
bias_halfse <-  mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt_halfse)
bias_1se <- mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt_se)

uniqueCoeffs <- data.frame("Best lambda model" = c(signif(RMSE_best,3), as.character(signif(bias_best, 3)),
  as.integer(length(globModTerms)-1), as.integer(prednames_fig_num), 
                                                   as.integer(sum(prednames_fig %in% c(prednames_clim))),
                                                   as.integer(sum(prednames_fig %in% c(prednames_weath))),
                                                   as.integer(sum(prednames_fig %in% c(prednames_soils)))
                                                   ), 
                           "1/2 se lambda model" = c(signif(RMSE_halfse,3), as.character(signif(bias_halfse, 3)),
                             length(globModTerms_halfse)-1, prednames_fig_halfse_num,
                                                   sum(prednames_fig_halfse %in% c(prednames_clim)),
                                                   sum(prednames_fig_halfse %in% c(prednames_weath)),
                                                   sum(prednames_fig_halfse %in% c(prednames_soils))), 
                           "1se lambda model" = c(signif(RMSE_1se,3), as.character(signif(bias_1se, 3)),
                             length(globModTerms_1se)-1, prednames_fig_1se_num,
                                                   sum(prednames_fig_1se %in% c(prednames_clim)),
                                                   sum(prednames_fig_1se %in% c(prednames_weath)),
                                                   sum(prednames_fig_1se %in% c(prednames_soils))))
row.names(uniqueCoeffs) <- c("RMSE", "bias: mean(obs-pred.)", "Total number of coefficients", "Number of unique coefficients",
                             "Number of unique climate coefficients", 
                             "Number of unique weather coefficients",  
                             "Number of unique soils coefficients"
                             )

kable(uniqueCoeffs, 
      col.names = c("Best lambda model", "1/2 se lambda model", "1se lambda model"), row.names = TRUE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
Best lambda model 1/2 se lambda model 1se lambda model
RMSE 0.315 0.323 0.333
bias: mean(obs-pred.) -2.79e-14 -2.15e-13 8.62e-14
Total number of coefficients 21 8 5
Number of unique coefficients 11 7 5
Number of unique climate coefficients 8 5 3
Number of unique weather coefficients 0 0 0
Number of unique soils coefficients 3 2 2

Visualizations of Model Predictions and Residuals

In the following figures, we show model predictions made using the best lambda model, as well as an alternative “second-best” lambda model. As the alternative to the best lambda model, we use the model (1se or 1/2se of best Lambda) that has the fewest number of unique predictors, but is not a null model.

if (whichSecondBestMod == "auto") {
  # name of model w/ fewest # of predictors (but more than 0)
uniqueCoeff_min <- min(as.numeric(uniqueCoeffs[4,2:3])[which(as.numeric(uniqueCoeffs[4,2:3]) > 0)])
alternativeModel <- names(uniqueCoeffs[4,2:3])[which(uniqueCoeffs[4,2:3] == uniqueCoeff_min)]

if (is.finite(uniqueCoeff_min)) {
  if (length(alternativeModel) == 1) {
  if (alternativeModel == "X1.2.se.lambda.model") {
  mod_secondBest <- fit_glm_halfse
  name_secondBestMod <- "1/2 SE Model"
  prednames_secondBestMod <- prednames_fig_halfse
} else if (alternativeModel == "X1se.lambda.model") {
  mod_secondBest <- fit_glm_se
  name_secondBestMod <- "1 SE Model"
  prednames_secondBestMod <- prednames_fig_1se
}
} else {
  # if both alternative models have the same number of unique coefficients, chose the model that has the fewest number of total predictors
  uniqueCoeff_min2 <- min(as.numeric(uniqueCoeffs[3,alternativeModel]))
alternativeModel2 <- names(uniqueCoeffs[3,alternativeModel])[which(uniqueCoeffs[3,alternativeModel] == uniqueCoeff_min2)]
if (length(alternativeModel2) == 1) {
  if (alternativeModel2 == "X1.2.se.lambda.model") {
  mod_secondBest <- fit_glm_halfse
  name_secondBestMod <- "1/2 SE Model"
  prednames_secondBestMod <- prednames_fig_halfse
} else if (alternativeModel2 == "X1se.lambda.model") {
  mod_secondBest <- fit_glm_se
  name_secondBestMod <- "1 SE Model"
  prednames_secondBestMod <- prednames_fig_1se
}
} else {
  mod_secondBest <- fit_glm_halfse
  name_secondBestMod <- "1/2 SE Model"
  prednames_secondBestMod <- prednames_fig_halfse

}

}
  }else {
    mod_secondBest <- NULL
  name_secondBestMod <- "Intercept_Only"
  prednames_secondBestMod <- NULL
}
} else if (whichSecondBestMod == "1se") {
  mod_secondBest <- fit_glm_se
  name_secondBestMod <- "1 SE Model"
  prednames_secondBestMod <- prednames_fig_1se
} else if (whichSecondBestMod == "halfse") {
  mod_secondBest <- fit_glm_halfse
  name_secondBestMod <- "1/2 SE Model"
  prednames_secondBestMod <- prednames_fig_halfse
}

Predict on the test data

  # create prediction for each each model
# (i.e. for each fire proporation variable)
predict_by_response <- function(mod, df) {
  df_out <- df
  response_name <- paste0("TotalTreeCover_binom", "_pred")
  preds <- predict(mod, newdata = df_out, #s="lambda.min", 
                                     type = "response")
  #preds[preds<0] <- 0
  #preds[preds>100] <- 100
  df_out <- df_out %>% cbind(preds)
   colnames(df_out)[ncol(df_out)] <- response_name
  return(df_out)
}

pred_glm1 <- predict_by_response(fit_glm_bestLambda, modDat_1_sTest)#X[,2:ncol(X)])

## back-transform the 
# add back in true y values
# pred_glm1 <- pred_glm1 %>% 
#   cbind(data.frame("TotalTreeCover_binom" = modDat_1_sTest$TotalTreeCover_binom, 
#                    "TotalTreeCover" = modDat_1_sTest$TotalTreeCover))

# add back in lat/long data 
# pred_glm1 <- pred_glm1 %>% 
#   cbind(modDat_1_sTest[,c("x", "y", "Year")])

pred_glm1$resid <- pred_glm1[,"TotalTreeCover_binom"] - pred_glm1[,paste0("TotalTreeCover_binom", "_pred")]
pred_glm1$extremeResid <- NA
pred_glm1[pred_glm1$resid > .5 | pred_glm1$resid < -.5,"extremeResid"] <- 1

 # 
 # plot(density(pred_glm1$TotalTreeCover_binom_pred_binary))
 # plot(density(pred_glm1$TotalTreeCover_binom_pred))
 # abline(v = best_threshold_bestLambda, add = TRUE)
if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
} else {

pred_glm1_1se <- predict_by_response(mod_secondBest, modDat_1_sTest)#X[,2:ncol(X)])

# # add back in true y values
# pred_glm1_1se <- pred_glm1_1se %>% 
#   cbind(data.frame("TotalTreeCover_binom" = modDat_1_s$TotalTreeCover_binom, 
#                    "TotalTreeCover" = modDat_1_s$TotalTreeCover))
# 
# # add back in lat/x data 
# pred_glm1_1se <- pred_glm1_1se %>% 
#   cbind(modDat_1_s[,c("x", "y", "Year")])

pred_glm1_1se$resid <- pred_glm1_1se[,"TotalTreeCover_binom"] - pred_glm1_1se[,paste0("TotalTreeCover_binom", "_pred")]
pred_glm1_1se$extremeResid <- NA
pred_glm1_1se[pred_glm1_1se$resid > .5 | pred_glm1_1se$resid < -.5,"extremeResid"] <- 1

 # plot(density(pred_glm1_1se$TotalTreeCover_binom_pred_binary))
 # plot(density(pred_glm1_1se$TotalTreeCover_binom_pred))
 # abline(v = best_threshold_secondBest, add = TRUE)
}

Inital Model Diagnostics

ROC Curve based on classification accuracy of the model when used on the testing set (heldout years 2003, 2012 and 2019)

# use the fitted model to predict on the test set 
# Predict probabilities on test data
#predicted_prob <- predict(fit_glm_bestLambda, newdata = modDat_1_sTest, type = "response")

# Create ROC curve
roc_curve <- roc(pred_glm1$TotalTreeCover_binom, pred_glm1$TotalTreeCover_binom_pred)
# Plot ROC curve
plot(roc_curve, main =  paste0("ROC Curve for best lambda model \n","AUC: ",round(roc_curve$auc,2)))

# identify optimal threshold via ROC curve
# Identify threshold maximizing sensitivity and specificity
# coords <- coords(roc_curve, "best", best.method = "youden")
# best_threshold_bestLambda <- coords$threshold
# 
#cat("Optimal threshold for best lambda model based on ROC curve, based on Youden's J statistic:", best_threshold_bestLambda, "\n")

## tryin another method of identifying the threshold
library(PresenceAbsence)
DATA <- pred_glm1
DATA$ID <- 1:nrow(DATA)
#pred.prev <- predicted.prevalence(DATA = DATA[,c("ID", "TotalTreeCover_binom", "TotalTreeCover_binom_pred")], threshold = 20)
#accu <- presence.absence.accuracy(DATA[,c("ID", "TotalTreeCover_binom", "TotalTreeCover_binom_pred")], threshold = 20)
optimalThresholds <- optimal.thresholds(DATA = DATA[,c("ID", "TotalTreeCover_binom", "TotalTreeCover_binom_pred")], 
                                        threshold = 200) # tests 100 thresholds)
rm(DATA)
best_threshold_bestLambda <- optimalThresholds[optimalThresholds$Method == thresholdMethod, 2]
cat(paste0("Optimal threshold for best lambda model based on ROC curve, based on the ",thresholdMethod, " method: ", best_threshold_bestLambda, "\n"))
## Optimal threshold for best lambda model based on ROC curve, based on the MaxKappa method: 0.326633165829146
# use the fitted model to predict on the test set 
# Predict probabilities on test data
#predicted_prob_secondBest <- predict(mod_secondBest, newdata = modDat_1_sTest, type = "response")
# Create ROC curve
roc_curve_secondBest <- roc(pred_glm1_1se$TotalTreeCover_binom, pred_glm1_1se$TotalTreeCover_binom_pred, smooth = FALSE)
# Plot ROC curve
plot(roc_curve_secondBest, main =  paste0("ROC Curve for second-best lambda model \n","AUC: ",round(roc_curve_secondBest$auc,2)))

# identify optimal threshold via ROC curve
# Identify threshold maximizing sensitivity and specificity
# coords <- coords(roc_curve_secondBest, "best", best.method = "closest.topleft")
# coords2 <- coords(roc_curve_secondBest, "best", best.method = "youden")
# best_threshold_secondBest <- coords$threshold
DATA <- pred_glm1_1se
DATA$ID <- 1:nrow(DATA)
#pred.prev <- predicted.prevalence(DATA = DATA[,c("ID", "TotalTreeCover_binom", "TotalTreeCover_binom_pred")], threshold = 20)
#accu <- presence.absence.accuracy(DATA[,c("ID", "TotalTreeCover_binom", "TotalTreeCover_binom_pred")], threshold = 20)
optimalThresholds <- optimal.thresholds(DATA = DATA[,c("ID", "TotalTreeCover_binom", "TotalTreeCover_binom_pred")], 
                                        threshold = 200) # tests 100 thresholds)
rm(DATA)
best_threshold_secondBest <- optimalThresholds[optimalThresholds$Method == thresholdMethod, 2]
cat(paste0("Optimal threshold for second-best lambda model based on ROC curve, based on the ",thresholdMethod, " method: ", best_threshold_secondBest, "\n"))
## Optimal threshold for second-best lambda model based on ROC curve, based on the MaxKappa method: 0.306532663316583

Use the identified thresholds to make predictions binary

# best lambda model
# "binomialize" the continuous predictions
pred_glm1 <- pred_glm1 %>% 
  mutate(TotalTreeCover_binom_pred_binary = TotalTreeCover_binom_pred,
    TotalTreeCover_binom_pred_binary = replace(TotalTreeCover_binom_pred_binary, TotalTreeCover_binom_pred_binary > best_threshold_bestLambda, 1),
         TotalTreeCover_binom_pred_binary = replace(TotalTreeCover_binom_pred_binary, TotalTreeCover_binom_pred_binary <= best_threshold_bestLambda, 0)) 

# second-best lambda model
# "binomialize" the continuouse predictions
pred_glm1_1se <- pred_glm1_1se %>% 
  mutate(TotalTreeCover_binom_pred_binary = TotalTreeCover_binom_pred,
    TotalTreeCover_binom_pred_binary = replace(TotalTreeCover_binom_pred_binary, TotalTreeCover_binom_pred_binary > best_threshold_secondBest, 1),
         TotalTreeCover_binom_pred_binary = replace(TotalTreeCover_binom_pred_binary, TotalTreeCover_binom_pred_binary <= best_threshold_secondBest, 0)) 

Maps of Observations, Predictions, and Residuals

Observations across the temporal range of the dataset

# rasterize
# get reference raster
# test_rast <-  rast("../../../Data_raw/dayMet/rawMonthlyData/orders/70e0da02b9d2d6e8faa8c97d211f3546/Daymet_Monthly_V4R1/data/daymet_v4_prcp_monttl_na_1980.tif") %>%
#   #terra::aggregate(fact = 3, fun = "mean") %>% 
#   terra::project(crs("EPSG:4326"))
  # transform to match format of veg. data 
  
## add ecoregion boundaries (for our ecoregion level model)
regions <- sf::st_read(dsn = "../../../Data_raw/Level2Ecoregions/", layer = "NA_CEC_Eco_Level2") 
## Reading layer `NA_CEC_Eco_Level2' from data source 
##   `/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/cleanPED/PED_vegClimModels/Data_raw/Level2Ecoregions' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 2261 features and 8 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -4334052 ymin: -3313739 xmax: 3324076 ymax: 4267265
## Projected CRS: Sphere_ARC_INFO_Lambert_Azimuthal_Equal_Area
regions <- regions %>% 
  st_transform(crs = st_crs(crs("EPSG:4326"))) %>% 
  st_make_valid() 

ecoregionLU <- data.frame("NA_L1NAME" = sort(unique(regions$NA_L1NAME)), 
                        "newRegion" = c(NA, "Forest", "dryShrubGrass", 
                                        "dryShrubGrass", "Forest", "dryShrubGrass",
                                       "dryShrubGrass", "Forest", "Forest", 
                                       "dryShrubGrass", "Forest", "Forest", 
                                       "Forest", "Forest", "dryShrubGrass", 
                                       NA
                                        ))
goodRegions <- regions %>% 
  left_join(ecoregionLU)
mapRegions <- goodRegions %>% 
  filter(!is.na(newRegion)) %>% 
  group_by(newRegion) %>% 
  summarise(geometry = sf::st_union(geometry)) %>% 
  ungroup() %>% 
  st_simplify(dTolerance = 1000) %>% 
  st_crop(ext(-130, -60, 20, 60))

# rasterize data
plotObs <- modDat_1_s %>% 
  rbind(modDat_1_sTest) %>% #pred_glm1 %>% 
         drop_na("TotalTreeCover_binom") #%>% 
  # sf::st_as_sf(coords = c("x", "y"), remove = FALSE) %>% 
  # sf::st_set_crs(crs(test_rast)) #%>% 
  #slice_sample(n = 5e4) %>%
  #terra::vect(geom = c("x", "y")) %>% 
  #terra::set.crs(crs(test_rast)) #%>% 
 #   terra::rasterize(y = test_rast, 
 #                    field = "TotalTreeCover_binom", 
 #                  fun = function(x) {
 #                    round(mean(x, na.rm = TRUE))
 #                  }) %>% 
 # terra::crop(ext(-130, -60, 20, 60))

# make shapefile of cropped state boundaries in appropriate crs
cropped_states_2 <- cropped_states %>% 
  st_transform(crs = "EPSG:4326") %>% 
  st_make_valid() %>% 
  st_crop(ext(-130, -60, 20, 60))

# make figure of raw tree cover
map_obs_cont <- ggplot() +
#geom_spatraster(data = plotObs) + 
 #geom_sf(data = plotObs, aes(col = TotalTreeCover_binom)) + 
   stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover), fun = mean, binwidth = .05) + 
  geom_sf(data=cropped_states_2 ,fill=NA ) +
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Observations of Continuous Total Tree Cover")) +
  scale_fill_gradient2(low = "brown",
                       mid = "wheat" ,
                       high = "darkgreen" , 
                       midpoint = 0,   na.value = "darkgrey") + 
  xlim(-125, -65) + 
  ylim(25, 50)

# make figure of binomialized tree cover
map_obs <- ggplot() +
#geom_spatraster(data = plotObs) + 
 #geom_sf(data = plotObs, aes(col = TotalTreeCover_binom)) + 
   stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover_binom), fun = mean, binwidth = .05) + 
     scale_fill_viridis_c(option = "A", guide = guide_colorbar(title = "% cover"), 
                          limits = c(0,100))  +
  geom_sf(data=cropped_states_2 ,fill=NA ) +
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Observations of Binomial Total Tree Cover")) +
  scale_fill_gradient2(low = "brown",
                       mid = "wheat" ,
                       high = "darkgreen" , 
                       midpoint = 0,   na.value = "darkgrey") + 
  xlim(-125, -65) + 
  ylim(25, 50)

hist_obs_cont <- modDat_1_s %>% 
  rbind(modDat_1_sTest) %>% 
  ggplot() + 
  geom_histogram(aes(TotalTreeCover), fill = "lightgrey", col = "darkgrey")

hist_obs <- 
  modDat_1_s %>% 
  rbind(modDat_1_sTest) %>% 
  ggplot() + 
  geom_histogram(aes(TotalTreeCover_binom), fill = "lightgrey", col = "darkgrey")

library(ggpubr)
ggarrange(map_obs_cont, hist_obs_cont, map_obs, hist_obs, heights = c(3, 1, 3, 1), ncol = 1)

Predictions on the test set using the best lambda model

# 
# # rasterize data
plotPred <- pred_glm1 %>%
         drop_na(paste0("TotalTreeCover_binom","_pred")) #%>%
#   #slice_sample(n = 5e4) %>%
#   sf::st_as_sf(coords = c("x", "y"), remove = FALSE) %>% 
#   sf::st_set_crs(crs(test_rast))
#   # terra::rasterize(y = test_rast, 
#   #                  field = paste0("TotalTreeCover_binom","_pred"), 
#   #                  fun = mean) %>% 
#   # terra::crop(ext(-130, -60, 20, 60))
# 

# make figure - continuous predictions
map_preds1_cont <- ggplot() +
#geom_spatraster(data = plotPred) + 
  stat_summary_2d(data = plotPred, aes(x = x, y = y, z = TotalTreeCover_binom_pred), fun = mean, binwidth = .05) +
  geom_sf(data=cropped_states_2,fill=NA )  + 
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Predictions from the 'best lambda' fitted model of Yes/No Trees \ncontinuous probabilities"),
     subtitle =  "bestLambda model")  +
  scale_fill_gradient2(low = "wheat",
                       mid = "darkgreen",
                       high = "red" , 
                       midpoint = 1,   na.value = "darkgrey",
                       limits = c(0,1)) + 
  xlim(-125, -65) + 
  ylim(25, 50)

# make figure - binomialized predictions
map_preds1 <- ggplot() +
#geom_spatraster(data = plotPred) + 
  stat_summary_2d(data = plotPred, aes(x = x, y = y, z = TotalTreeCover_binom_pred_binary), fun = function(x) {round(mean(x))}, binwidth = .05) + 
  geom_sf(data=cropped_states_2,fill=NA )  + 
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Predictions from the 'best lambda' fitted model of Yes/No Trees - \n made binary"),
     subtitle =  "bestLambda model")  +
  scale_fill_gradient2(low = "wheat",
                       mid = "darkgreen",
                       high = "red" , 
                       midpoint = 1,   na.value = "darkgrey",
                       limits = c(0,1),
                       breaks = c(0,1)) + 
  xlim(-125, -65) + 
  ylim(25, 50)

hist_preds1_cont <- ggplot(pred_glm1) + 
  geom_histogram(aes(.data[[paste0("TotalTreeCover_binom", "_pred")]]), fill = "lightgrey", col = "darkgrey")

hist_preds1 <- ggplot(pred_glm1) + 
  geom_histogram(aes(x = TotalTreeCover_binom_pred_binary), fill = "lightgrey", col = "darkgrey")#+ 
  #xlim(c(-.01,1.01))

ggarrange(map_preds1_cont, hist_preds1_cont, map_preds1, hist_preds1, heights = c(3,1,3,1), ncol = 1)

Predictions on the test set using the second best lambda model

if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
} else {

# rasterize data
plotPred <- pred_glm1_1se %>% 
         drop_na(paste0("TotalTreeCover_binom","_pred")) #%>% 
  # #slice_sample(n = 5e4) %>%
  # sf::st_as_sf(coords = c("x", "y"), remove = FALSE) %>% 
  # sf::st_set_crs(crs(test_rast))
  # terra::vect(geom = c("x", "y")) %>% 
  # terra::set.crs(crs(test_rast)) %>% 
  # terra::rasterize(y = test_rast, 
  #                  field = paste0("TotalTreeCover_binom","_pred"), 
  #                  fun = mean) %>% 
  #    terra::crop(ext(-130, -60, 20, 60))

# make figures
map_preds2_cont <- ggplot() +
    stat_summary_2d(data = plotPred, aes(x = x, y = y, z = TotalTreeCover_binom_pred), fun = mean, binwidth = .05) + 
#geom_spatraster(data = plotPred) + 
  geom_sf(data=cropped_states_2,fill=NA )  + 
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Predictions from the ", name_secondBestMod, " of Yes/No Trees \ncontinuous probabilities"),
     subtitle =  name_secondBestMod)  +
  scale_fill_gradient2(low = "wheat",
                       mid = "darkgreen",
                       high = "red" , 
                       midpoint = 1,   na.value = "darkgrey",
                       limits = c(0, 1))  + 
  xlim(-125, -65) + 
  ylim(25, 50)

# make figure - binomialized predictions
map_preds2 <- ggplot() +
#geom_spatraster(data = plotPred) + 
  stat_summary_2d(data = plotPred, aes(x = x, y = y, z = TotalTreeCover_binom_pred_binary), fun = function(x) {round(mean(x))}, binwidth = .05) + 
  geom_sf(data=cropped_states_2,fill=NA )  + 
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Predictions from the ", name_secondBestMod, " of Yes/No Trees - \n made binary"),
     subtitle =  name_secondBestMod)  +
  scale_fill_gradient2(low = "wheat",
                       mid = "darkgreen",
                       high = "red" , 
                       midpoint = 1,   na.value = "darkgrey",
                       limits = c(0,1),
                       breaks = c(0,1)) + 
  xlim(-125, -65) + 
  ylim(25, 50)

hist_preds2_cont <- ggplot(pred_glm1_1se) + 
  geom_histogram(aes(.data[[paste0("TotalTreeCover_binom", "_pred")]]), fill = "lightgrey", col = "darkgrey")

hist_preds2 <- ggplot(pred_glm1_1se) + 
  geom_histogram(aes(x = TotalTreeCover_binom_pred_binary), fill = "lightgrey", col = "darkgrey") 

ggarrange(map_preds2_cont, hist_preds2_cont, map_preds2, hist_preds2, heights = c(3,1,3,1), ncol = 1)
}

A map of classification error of predictions on the test set made using the best lambda model

## calculate the classification error (binomial obs - binomial pred)
pred_glm1 <- pred_glm1 %>% 
  mutate(missClass = TotalTreeCover_binom - TotalTreeCover_binom_pred_binary)

# make figures
(map_missClass <- ggplot() +
#geom_spatraster(data =plotResid_rast) + 
    stat_summary_2d(data = pred_glm1, aes(x = x, y = y, z = missClass), fun = function(x) {round(mean(x))}, binwidth = .05) + 
  geom_sf(data=cropped_states_2,fill=NA )  + 
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
  #geom_sf(data = badResids_high, col = "blue") +
  #geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Missclassification (obs. - pred.) from the model of Yes/No Trees"),
     subtitle = "bestLambda model \n binary observations - binary predictions") +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" , 
                       midpoint = 0,   na.value = "darkgrey",
                       limits = c(-1,1),
                       breaks = c(-1,0,1),
                       labels = c("Data = no trees; Model = trees ", "aggree", 
                                  "Data = trees; Model = no trees")
                       ) + 
  xlim(-125, -65) + 
  ylim(25, 50))

# make a confusion matrix 
# prepare data
matData <- pred_glm1 %>% 
  mutate(predClass = TotalTreeCover_binom_pred_binary, 
         obsClass = TotalTreeCover_binom) %>% 
  mutate(predClass = replace(predClass, predClass == 0, "no trees"),
         predClass = replace(predClass, predClass == 1, "trees"),
         obsClass = replace(obsClass, obsClass == 0, "no trees"),
         obsClass = replace(obsClass, obsClass == 1, "trees")) %>% 
mutate(predClass = as.factor(predClass) ,
       obsClass = as.factor(obsClass))

# make matrix as a data.frame
# confMat <-  confusionMatrix(data = matData$predClass,
#                  reference = matData$obsClass)
ConfusionTableR::binary_visualiseR(
  train_labels = as.factor(matData$predClass),
  truth_labels = as.factor(matData$obsClass),
   class_label1 = "No trees",
   class_label2 = "Trees",
   quadrant_col1 = "wheat",
   quadrant_col2 = "darkgreen",
  text_col = "white"
)

###A map of classification error of predictions on the test set made using the second best lambda model

## calculate the classification error (binomial obs - binomial pred)
pred_glm1_1se <- pred_glm1_1se %>% 
  mutate(missClass = TotalTreeCover_binom - TotalTreeCover_binom_pred_binary)

# make figures
map_missClass <- ggplot() +
#geom_spatraster(data =plotResid_rast) + 
    stat_summary_2d(data = pred_glm1_1se, aes(x = x, y = y, z = missClass), fun = function(x) {round(mean(x))}, binwidth = .05) + 
  geom_sf(data=cropped_states_2,fill=NA )  + 
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
  #geom_sf(data = badResids_high, col = "blue") +
  #geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Missclassification (obs. - pred.) from the model of Yes/No Trees"),
     subtitle = paste0(name_secondBestMod, "\nbinomialized observations - binomial predictions")) +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" , 
                       midpoint = 0,   na.value = "darkgrey",
                       limits = c(-1,1),
                       breaks = c(-1,0,1),
                       labels = c("Data = no trees; Model = trees ", "aggree", 
                                  "Data = trees; Model = no trees")
                       ) + 
  xlim(-125, -65) + 
  ylim(25, 50)

map_missClass

# make a confusion matrix 
# prepare data
matData <- pred_glm1_1se %>% 
  mutate(predClass = TotalTreeCover_binom_pred_binary, 
         obsClass = TotalTreeCover_binom) %>% 
  mutate(predClass = replace(predClass, predClass == 0, "no trees"),
         predClass = replace(predClass, predClass == 1, "trees"),
         obsClass = replace(obsClass, obsClass == 0, "no trees"),
         obsClass = replace(obsClass, obsClass == 1, "trees")) %>% 
mutate(predClass = as.factor(predClass) ,
       obsClass = as.factor(obsClass))

# make matrix as a data.frame
# confMat <-  confusionMatrix(data = matData$predClass,
#                  reference = matData$obsClass)
ConfusionTableR::binary_visualiseR(
  train_labels = as.factor(matData$predClass),
  truth_labels = as.factor(matData$obsClass),
   class_label1 = "No trees",
   class_label2 = "Trees",
   quadrant_col1 = "wheat",
   quadrant_col2 = "darkgreen",
  text_col = "white"
)

Are there biases of the model predictions across year/lat/long?

if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
  # plot misclassifications against Year
yearResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = jitter(Year), y = jitter(missClass)), alpha = .1) + 
  geom_hline(aes(yintercept = 0), lty = 2, col = "red") + 
  geom_smooth(aes(x = Year, y = missClass), method = "lm") + 
  xlab("Year") + 
  ylab("misclassifications") +
  ggtitle("from best lamba model")

# plot misclassifications against Lat
latResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = y, y = jitter(missClass)), alpha = .1) + 
  geom_hline(aes(yintercept = 0), lty = 2, col = "red") + 
  geom_smooth(aes(x = y, y = missClass)) + 
  xlab("Latitude") + 
  ylab("misclassifications") +
  ggtitle("from best lamba model")

# plot misclassifications against Long
longResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = x, y = jitter(missClass)), alpha = .1) + 
  geom_hline(aes(yintercept = 0), lty = 2, col = "red") + 
  geom_smooth(aes(x = x, y = missClass)) + 
  xlab("Longitude") + 
  ylab("misclassifications") +
  ggtitle("from best lamba model")
library(patchwork)
(yearResidMod_bestLambda ) / 
(  latResidMod_bestLambda ) /
(  longResidMod_bestLambda )
} else {

# plot misclassifications against Year
yearResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = jitter(Year), y = jitter(missClass)), alpha = .1) + 
  geom_hline(aes(yintercept = 0), lty = 2, col = "red") + 
  geom_smooth(aes(x = Year, y = missClass), method = "lm") + 
  xlab("Year") + 
  ylab("misclassifications") +
  ggtitle("from best lamba model")
yearResidMod_1seLambda <- ggplot(pred_glm1_1se) + 
  geom_point(aes(x = jitter(Year), y = jitter(missClass)), alpha = .1) + 
  geom_hline(aes(yintercept = 0), lty = 2, col = "red") + 
  geom_smooth(aes(x = Year, y = missClass), method = "lm") + 
  xlab("Year") + 
  ylab("misclassifications") +
  ggtitle(paste0("from ", name_secondBestMod))

# plot misclassifications against Lat
latResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = y, y = jitter(missClass)), alpha = .1) + 
  geom_hline(aes(yintercept = 0), lty = 2, col = "red") + 
  geom_smooth(aes(x = y, y = missClass)) + 
  xlab("Latitude") + 
  ylab("misclassifications") +
  ggtitle("from best lamba model")
latResidMod_1seLambda <- ggplot(pred_glm1_1se) + 
  geom_point(aes(x = y, y = jitter(missClass)), alpha = .1) + 
  geom_hline(aes(yintercept = 0), lty = 2, col = "red") + 
  geom_smooth(aes(x = y, y = missClass)) + 
  xlab("Latitude") + 
  ylab("misclassifications") +
  ggtitle(paste0("from ", name_secondBestMod))

# plot misclassifications against Long
longResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = x, y = jitter(missClass)), alpha = .1) + 
  geom_hline(aes(yintercept = 0), lty = 2, col = "red") + 
  geom_smooth(aes(x = x, y = missClass)) + 
  xlab("Longitude") + 
  ylab("misclassifications") +
  ggtitle("from best lamba model")
longResidMod_1seLambda <- ggplot(pred_glm1_1se) + 
  geom_point(aes(x = x, y = jitter(missClass)), alpha = .1) + 
  geom_hline(aes(yintercept = 0), lty = 2, col = "red") + 
  geom_smooth(aes(x = x, y = missClass)) + 
  xlab("Longitude") + 
  ylab("misclassifications") +
  ggtitle(paste0("from ", name_secondBestMod))

library(patchwork)
(yearResidMod_bestLambda + yearResidMod_1seLambda) / 
(  latResidMod_bestLambda + latResidMod_1seLambda) /
(  longResidMod_bestLambda + longResidMod_1seLambda)
}

Quantile plots - comparing “binary” observations in the testing datase to continuous model-predicted probabilities

Binning predictor variables into “Quantiles”and looking at the mean predicted probability for each quantile

response_vars <- c("TotalTreeCover_binom", "TotalTreeCover_binom_pred")
# get deciles for best lambda model 
if (length(prednames_fig) == 0) {
  print("The best lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles <- predvars2deciles(pred_glm1,
                                      response_vars = response_vars,
                                        pred_vars = prednames_fig, 
                                       cut_points = seq(0, 1, 0.01))
}
# get deciles for 1 SE lambda model 
if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")} else {
  pred_glm1_deciles_1se <- predvars2deciles(pred_glm1_1se,
                                      response_vars = response_vars,
                                        pred_vars = prednames_secondBestMod, 
                                       cut_points = seq(0, 1, 0.01))
  }

Below are quantile plots for the best lambda model (note that the predictor variables are scaled)

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {

# publication quality version
g3 <- decile_dotplot_pq(df = pred_glm1_deciles, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")), IQR = FALSE,
                        CI = FALSE
                        ) + ggtitle("Decile Plot")

g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")), dfRaw = pred_glm1, add_smooth = TRUE, deciles = FALSE)

  
if(save_figs) {
  # png(paste0("../../../Figures/CoverDatFigures/ figures/quantile_plots/quantile_plot_", response,  "_",ecoregion,".png"), 
  #    units = "in", res = 600, width = 5.5, height = 3.5 )
  #   print(g4)
  # dev.off()
}

g4
}

Below are quantile plots from the second best lambda model ()

if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
  } else {

# publication quality version
g3 <- decile_dotplot_pq(pred_glm1_deciles_1se, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")), IQR = FALSE) + ggtitle("Decile Plot")

g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles_1se, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")), dfRaw = pred_glm1_1se, add_smooth = TRUE, deciles = FALSE)

  
if(save_figs) {
  # png(paste0("figures/quantile_plots/quantile_plot_", response,  "_",ecoregion,".png"), 
  #    units = "in", res = 600, width = 5.5, height = 3.5 )
  #   print(g4)
  # dev.off()
}

g4
}

Filtered Quantiles - comparing “binary” observations in the testing datase to continuous model-predicted probabilities

For the best lambda model

Filtered quantile plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower 20th percentiles of each predictor variable.

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1, 
                         response_vars = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")),
                         pred_vars = prednames_fig,
                         filter_var = TRUE,
                         filter_vars = prednames_fig,
                         cut_points = seq(0, 1, 0.01)) 
g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt, response = "TotalTreeCover_binom", 
                                xvars = prednames_fig)

if(save_figs) {
# jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
#      units = "in", res = 600, width = 5.5, height = 6 )
#   g 
# dev.off()
}
g
}

Filtered quantile figure with middle 2 deciles also shown

if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
} else {
pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1, 
                         response_vars = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")),
                         pred_vars = prednames_fig,
                         filter_vars = prednames_fig,
                         filter_var = TRUE,
                         add_mid = TRUE,
                         cut_points = seq(0, 1, 0.01))

g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, response = "TotalTreeCover_binom", 
                                xvars = prednames_fig)

if(save_figs) {
# jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
#      units = "in", res = 600, width = 5.5, height = 6)
#   g 
# dev.off()
}
g
}

For the second best lambda model ()

Filtered ‘Quantile’ plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower 20th percentiles of each predictor variable.

if (length(prednames_secondBestMod) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1_1se, 
                         response_vars = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")),
                         pred_vars = prednames_secondBestMod,
                         filter_var = TRUE,
                         filter_vars = prednames_secondBestMod,
                         cut_points = seq(0, 1, 0.01)) 
g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt, response = "TotalTreeCover_binom", 
                                xvars = prednames_fig)

if(save_figs) {
# jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
#      units = "in", res = 600, width = 5.5, height = 6 )
#   g 
# dev.off()
}
g
}

Filtered quantile figure with middle 2 deciles also shown

if (length(prednames_secondBestMod) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1, 
                         response_vars = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")),
                         pred_vars = prednames_secondBestMod,
                         filter_vars = prednames_secondBestMod,
                         filter_var = TRUE,
                         add_mid = TRUE,
                         cut_points = seq(0, 1, 0.01))

g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, response = "TotalTreeCover_binom", 
                                xvars = prednames_fig)


if(save_figs) {
# jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
#      units = "in", res = 600, width = 5.5, height = 6 )
#   g 
# dev.off()
}
g
}

Quantile plots - comparing “binary” testing data to “binary” model-predictions (i.e. converting the continuous model predictions to discrete values (1 or 0) based on the previously-identified optimal threshold)

response_vars <- c("TotalTreeCover_binom", "TotalTreeCover_binom_pred_binary")
# get deciles for best lambda model 
if (length(prednames_fig) == 0) {
  print("The best lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles <- predvars2deciles(pred_glm1,
                                      response_vars = response_vars,
                                        pred_vars = prednames_fig, 
                                       cut_points = seq(0, 1, 0.01))
}
# get deciles for 1 SE lambda model 
if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")} else {
  pred_glm1_deciles_1se <- predvars2deciles(pred_glm1_1se,
                                      response_vars = response_vars,
                                        pred_vars = prednames_secondBestMod, 
                                       cut_points = seq(0, 1, 0.01))
  }

Below are quantile plots for the best lambda model (note that the predictor variables are scaled)

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
# ####  code to generate plots analagous to "decile plots" by hand, do double check that they're suitable for use w/ binary predictions 
#   pred_glm1 %>% 
#     #slice_sample(n = 10000) %>% 
#     pivot_longer(cols = all_of(prednames_fig)) %>% 
#     select(Year, x, y, TotalTreeCover_binom, TotalTreeCover_binom_pred_binary, name, value) %>% 
#     ggplot() + 
#     facet_wrap(~name, scales = "free") + 
#     geom_point(aes(x = value, y = jitter(TotalTreeCover_binom)), alpha = .5) + 
#     geom_point(aes(x = value, y = jitter(TotalTreeCover_binom_pred_binary)), col = "blue", alpha = .5) + 
#     geom_smooth(aes(x = value, y = TotalTreeCover_binom), col = "black") + 
#     geom_smooth(aes(x = value, y = TotalTreeCover_binom_pred_binary), col = "blue") 
#                  
  
  
  #####
# publication quality version
g3 <- decile_dotplot_pq(df = pred_glm1_deciles, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred_binary")), IQR = FALSE,
                        CI = FALSE
                        ) + ggtitle("Decile Plot")

g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred_binary")), dfRaw = pred_glm1, add_smooth = TRUE, deciles = FALSE)

  
if(save_figs) {
  # png(paste0("../../../Figures/CoverDatFigures/ figures/quantile_plots/quantile_plot_", response,  "_",ecoregion,".png"), 
  #    units = "in", res = 600, width = 5.5, height = 3.5 )
  #   print(g4)
  # dev.off()
}

g4
}

Below are quantile plots from the second best lambda model ()

if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
  } else {

# publication quality version
g3 <- decile_dotplot_pq(pred_glm1_deciles_1se, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred_binary")), IQR = FALSE) + ggtitle("Decile Plot")

g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles_1se, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred_binary")), dfRaw = pred_glm1_1se, add_smooth = TRUE, deciles = FALSE)

  
if(save_figs) {
  # png(paste0("figures/quantile_plots/quantile_plot_", response,  "_",ecoregion,".png"), 
  #    units = "in", res = 600, width = 5.5, height = 3.5 )
  #   print(g4)
  # dev.off()
}

g4
}

Show model RMSE w/in each quantile

# get deciles for best lambda model 
if (length(prednames_fig) == 0) {
  print("The best lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles %>% 
    ggplot(aes(x = mean_value, y = RMSE)) +
    facet_wrap(~name, scales = "free_x") +
    geom_point(alpha = .2, size = .5) + 
    geom_smooth(lwd = .5) + 
    xlab("Scaled predictor value") + 
    ggtitle("RMSE by decile for bestLambda model")
}

# get deciles for 1 SE lambda model 
if (length(prednames_secondBestMod) == 0) {
  print("The 1SE (or 1/2 SE) lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles_1se %>% 
    ggplot(aes(x = mean_value, y = RMSE)) +
    facet_wrap(~name, scales = "free_x") +
    geom_point(alpha = .2, size = .5) + 
    geom_smooth(lwd = .5) + 
    xlab("Scaled predictor value") + 
    ggtitle(paste0("RMSE by decile for ", name_secondBestMod, "model"))
}